decoded status.

Anon Sricharoenchai Fri, 08 Dec 2000 02:36:12 -0800

As per RFC 1738 (http://www.w3.org/Addressing/rfc1738.txt, page 3),
the characters ";", "/", "?", ":", "@", "=" and "&" are reserved and should
not be encoded or decoded by wget, because they have a special meaning.
Encoding or decoding these characters may change the semantics of a URL.

For example, "http://abc.xyz/abc?def" and "http://abc.xyz/abc%3Fdef" are
not the same location.  However, when the command

        $ wget 'http://abc.xyz/abc%3Fdef'

is invoked, wget changes "%3F" to "?", which leads to the wrong location.


Also, the "+" character (not listed in the RFC) should retain its
encoded/decoded status, because it has a different meaning when it
appears in a CGI query.  For example,

"http://abc.xyz/abc.cgi?var1=a+b" means that var1 = "a b"
"http://abc.xyz/abc.cgi?var1=a%2Bb" means that var1 = "a+b"


The following patch against wget 1.5.3 fixes this bug.


diff -ur wget-1.5.3.orig/src/url.c wget-1.5.3/src/url.c
--- wget-1.5.3.orig/src/url.c   Fri Sep 11 07:23:26 1998
+++ wget-1.5.3/src/url.c        Mon Oct 23 19:49:04 2000
@@ -51,6 +51,12 @@
 /* URL separator (for findurl) */
 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
 
+/* A list of characters reserved for special meaning, as per RFC1738.
+   Encoding or decoding these characters may change the semantics of a URL.
+   '+' was added because "+" and "%2B" have different meanings when they
+   appear in a CGI query. */
+#define URL_RESERVED ";/?:@=&" "+"
+
 /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
    ':' (not listed in RFC) were added because of user/password
    encoding, and \033 for safe printing.  */
@@ -73,6 +79,16 @@
     }                                          \
 } while (0)
 
+#define URL_CLEANSE2(s, url_unsafe) do         \
+{                                              \
+  if (1)                                       \
+    {                                          \
+      char *uc_tmp = encode_string2 (s, url_unsafe);\
+      free (s);                                        \
+      (s) = uc_tmp;                            \
+    }                                          \
+} while (0)
+
 /* Is a directory "."?  */
 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
 /* Is a directory ".."?  */
@@ -184,7 +200,7 @@
    literally.  */
 
 static void
-decode_string (char *s)
+decode_string (char *s, const char *url_reserved)
 {
   char *p = s;
 
@@ -203,6 +219,13 @@
              continue;
            }
          *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
+          if (strchr(url_reserved, *p))
+           {
+             *p = '%';
+              *(s + 1) = toupper(*(s + 1));
+              *(s + 2) = toupper(*(s + 2));
+             continue;
+           }
          s += 2;
        }
     }
@@ -237,6 +260,46 @@
   *p = '\0';
   return res;
 }
+
+char *
+encode_string2 (const char *s, const char *url_unsafe)
+{
+  const char *b;
+  char *p, *res;
+  int i;
+
+  b = s;
+  for (i = 0; *s; s++, i++)
+    {
+      if (*s == '%' && *(s + 1) && *(s + 2)
+          && (ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
+        continue;
+      if (strchr (url_unsafe, *s))
+        i += 2; /* Two more characters (hex digits) */
+    }
+  res = (char *)xmalloc (i + 1);
+  s = b;
+  for (p = res; *s; s++)
+    {
+      if (*s == '%' && *(s + 1) && *(s + 2)
+          && (ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
+        {
+          *p++ = *s;
+          continue;
+        }
+      if (strchr (url_unsafe, *s))
+        {
+          const unsigned char c = *s;
+          *p++ = '%';
+          *p++ = HEXD2ASC (c >> 4);
+          *p++ = HEXD2ASC (c & 0xf);
+        }
+      else
+        *p++ = *s;
+    }
+  *p = '\0';
+  return res;
+}
 
 /* Returns the proto-type if URL's protocol is supported, or
    URLUNKNOWN if not.  */
@@ -471,12 +534,22 @@
   /* Parse the username and password (if existing).  */
   parse_uname (url, &u->user, &u->passwd);
   /* Decode the strings, as per RFC 1738.  */
-  decode_string (u->host);
-  decode_string (u->path);
+  decode_string (u->host, "");
+  // Guard against double decoding: "%%32%36" must not become "%26" (0x26 is '&'); //
+  // it should become "%2526" instead. //
+  // So, first quote every unsafe (literal) "%". //
+  {
+    char* path_temp = xstrdup (u->path);
+    URL_CLEANSE2 (path_temp, "%");
+    free (u->path); u->path = (char *)xmalloc (strlen (path_temp) + 8);
+    strcpy (u->path, path_temp);
+    free (path_temp);
+  };
+  decode_string (u->path, URL_RESERVED "%" URL_UNSAFE);
   if (u->user)
-    decode_string (u->user);
+    decode_string (u->user, "");
   if (u->passwd)
-    decode_string (u->passwd);
+    decode_string (u->passwd, "");
   /* Parse the directory.  */
   parse_dir (u->path, &u->dir, &u->file);
   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
@@ -498,7 +571,7 @@
   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
   strcat (u->path, *u->dir ? "/" : "");
   strcat (u->path, u->file);
-  URL_CLEANSE (u->path);
+  URL_CLEANSE2 (u->path, URL_UNSAFE);
   /* Create the clean URL.  */
   u->url = str_url (u, 0);
   return URLOK;
@@ -618,6 +691,8 @@
     return '\0';
 }
 
+#define CLEANDUP2(x, url_unsafe) (1 ? encode_string2 (x, url_unsafe) \
+                                    : xstrdup (x))
 /* Return the URL as fine-formed string, with a proper protocol, port
    number, directory and optional user/password.  If HIDE is non-zero,
    password will be hidden.  The forbidden characters in the URL will
@@ -636,8 +711,8 @@
     return NULL;
   proto_name = sup_protos[i].name;
   host = CLEANDUP (u->host);
-  dir = CLEANDUP (u->dir);
-  file = CLEANDUP (u->file);
+  dir = CLEANDUP2 (u->dir, URL_UNSAFE);
+  file = CLEANDUP2 (u->file, URL_UNSAFE);
   user = passwd = NULL;
   if (u->user)
     user = CLEANDUP (u->user);
@@ -1167,15 +1242,23 @@
     }
   free (host);
 
+  dir = xstrdup (dir);
+  URL_CLEANSE2 (dir, URL_UNSAFE);
   /* If there is a prefix, prepend it.  */
   if (*dirpref)
     {
-      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
+//      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
+      char *newdir;
+      dirpref = xstrdup(dirpref);
+      URL_CLEANSE (dirpref);
+      newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
+      free(dir); free(dirpref);
       dir = newdir;
+      dir = xstrdup (dir);
     }
-  dir = xstrdup (dir);
-  URL_CLEANSE (dir);
+//  dir = xstrdup (dir);
+//  URL_CLEANSE (dir);
   l = strlen (dir);
   if (l && dir[l - 1] == '/')
     dir[l - 1] = '\0';
[Cooker] Bug in wget 1.5.3, the characters ";/?=&+" must retain their encoded/decoded status.

Reply via email to